import pandas as pd
import os
os.makedirs('Data/',exist_ok=True) 
os.listdir('Data')

['title_basics.csv.gz']


# read in your csv 
basics = pd.read_csv('Data/title_basics.csv.gz', low_memory=False)
# drop columns that are deemed unimportant by the stakeholder
basics=basics.drop(columns=["originalTitle","isAdult","titleType","endYear"])                  
basics.head()


# create individual rows, each with a single genre
basics['genres_split'] = basics['genres'].str.split(',')
exploded_genres = basics.explode('genres_split')
exploded_genres


# sanitycheck to make sure the explode worked
unique_genres = sorted(exploded_genres['genres_split'].unique())
unique_genres

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']


title_genres=exploded_genres[['tconst','genres_split']].copy()
title_genres.head()


genre_id_map=dict(zip(unique_genres,range(len(unique_genres))))
genre_id_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}


title_genres['genre_id'] = title_genres['genres_split'].map(genre_id_map)
title_genres=title_genres.drop(columns='genres_split')
genre_map=pd.DataFrame({'genre_name':genre_id_map.keys(),
                    'genre_id':genre_id_map.values()})
genre_map.head()


#col no longer needed for loading into sql after transformations
basics=basics.drop("genres_split",axis=1)


# verify  all columns are how you would like them to be exported into MySQL
basics.head()


import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
#from sqlalchemy_utils import create_database, database_exists

connection_str = "mysql+pymysql://root:root@localhost/Movies"
engine = create_engine(connection_str)


title_genres.to_sql('title_genres',engine,if_exists='replace',index=False)

q = """SELECT *FROM title_genres LIMIT 5;"""
pd.read_sql(q, engine)


genre_map.to_sql('genres',engine,if_exists='replace',index=False)

q = """SELECT *FROM genres LIMIT 5;"""
pd.read_sql(q, engine)


basics.dtypes

tconst             object
primaryTitle       object
startYear         float64
runtimeMinutes      int64
genres             object
dtype: object


from sqlalchemy.types import *

key_len = basics['tconst'].fillna('').map(len).max()
title_len = basics['primaryTitle'].fillna('').map(len).max()
gen_len= basics['genres'].fillna('').map(len).max()
df_schema_basics = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    "startYear":Float(),
    "runtimeMinutes":Integer(),
    "genres":Text(gen_len+1)}

basics.to_sql('title_basics',engine,dtype=df_schema_basics,if_exists='replace',index=False)
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1adb809d310>


q = """SELECT *FROM title_basics LIMIT 5;"""
pd.read_sql(q, engine)

	tconst	primaryTitle	startYear	runtimeMinutes	genres
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance
1	tt0062336	The Tango of the Widower and Its Distorting Mi...	2020.0	70	Drama
2	tt0068865	Lives of Performers	2016.0	90	Drama
3	tt0069049	The Other Side of the Wind	2018.0	122	Drama
4	tt0088751	The Naked Monster	2005.0	100	Comedy,Horror,Sci-Fi

	tconst	primaryTitle	startYear	runtimeMinutes	genres	genres_split
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance	Comedy
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance	Fantasy
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance	Romance
1	tt0062336	The Tango of the Widower and Its Distorting Mi...	2020.0	70	Drama	Drama
2	tt0068865	Lives of Performers	2016.0	90	Drama	Drama
...	...	...	...	...	...	...
82003	tt9916190	Safeguard	2020.0	95	Action,Adventure,Thriller	Action
82003	tt9916190	Safeguard	2020.0	95	Action,Adventure,Thriller	Adventure
82003	tt9916190	Safeguard	2020.0	95	Action,Adventure,Thriller	Thriller
82004	tt9916362	Coven	2020.0	92	Drama,History	Drama
82004	tt9916362	Coven	2020.0	92	Drama,History	History

	tconst	primaryTitle	startYear	runtimeMinutes	genres
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance
1	tt0062336	The Tango of the Widower and Its Distorting Mi...	2020.0	70	Drama
2	tt0068865	Lives of Performers	2016.0	90	Drama
3	tt0069049	The Other Side of the Wind	2018.0	122	Drama
4	tt0088751	The Naked Monster	2005.0	100	Comedy,Horror,Sci-Fi

	tconst	genre_id
0	tt0035423	5
1	tt0035423	9
2	tt0035423	18
3	tt0062336	7
4	tt0068865	7

	tconst	primaryTitle	startYear	runtimeMinutes	genres
0	tt0035423	Kate & Leopold	2001.0	118	Comedy,Fantasy,Romance
1	tt0062336	The Tango of the Widower and Its Distorting Mi...	2020.0	70	Drama
2	tt0068865	Lives of Performers	2016.0	90	Drama
3	tt0069049	The Other Side of the Wind	2018.0	122	Drama
4	tt0088751	The Naked Monster	2005.0	100	Comedy,Horror,Sci-Fi

Exporting Data to MySQl¶

Required Transformation Steps for Title Basics:¶

Create DB in mySQL & load tables¶

	genre_name	genre_id
0	Action	0
1	Adult	1
2	Adventure	2
3	Animation	3
4	Biography	4

	genre_name	genre_id
0	Action	0
1	Adult	1
2	Adventure	2
3	Animation	3
4	Biography	4